# Course: BUAN 5210
# Title: Technical Appendix
# Purpose: Suggetion to motivae people to buy the assets managment or retirement product based on Basic EDA and detailed EDA
# Date: Mar 14th, 2019
# Author: Ying Xue
# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# library package
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ readr 1.3.1
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.0 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(htmlTable)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(shiny)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(mfx)
## Loading required package: sandwich
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: betareg
library(MASS)
# load data
raw_data <- read_csv("Personal_needs.csv")
# Data Manipulation
# select the columns of interest
research_data <-dplyr::select(raw_data,FinancialSecurity,tv,intrnet,socmed,charit,excer,garden,impulse,eatout,pubtran,sportfan,commun,envichoi,brandloyal,gender,age,jobstatus,Hhincome,retirement,educ,marstatus)
# remove the rows of na
tidy_table <- research_data %>% na.omit()
# convert character into factor
tidy_table <- tidy_table %>% mutate(
gender = as.factor(gender),
jobstatus= as.factor(jobstatus),
educ = as.factor(educ),
marstatus = as.factor(marstatus),
Hhincome = as.factor(Hhincome),
# rename the category within the Hhincome according to the Census Bureau 2017
Cat_Hhincome = case_when(
Hhincome == "$20,000-$39,999"| Hhincome == "Below $20,000"~"Pov/Low",
Hhincome == "$40,000-$59,999"|Hhincome == "$40,000-$59,999"|Hhincome == "$60,000-$79,999"|Hhincome == "$80,000-$99,999"|Hhincome == "$100,000-$149,999" ~ "Middle",
Hhincome == "$150,000-$199,999"|Hhincome == "$200,000 & above"~"High"),
Cat_Hhincome = as.factor(Cat_Hhincome),
# rename the category within the retirement
retirement = case_when(
retirement == "$100,000-$149,999" ~ "100k-150k",
retirement == "$20,000-$59,999"~"20k-60k",
retirement == "$200,000-$299,999" ~ "200k-300k",
retirement == "$300,000-$399,999" ~"300k-400k",
retirement == "$400,000-$499,999" ~ "400k-500k",
retirement == "$500,000 and above" ~ ">=500k",
retirement == "$60,000-$99,999" ~ "60k-100k",
retirement == "below $20,000" ~ "<20k",TRUE ~ "NO"),
Cat_retirement= case_when(retirement == "NO" ~ "NO", TRUE~"YES"),
retirement = as.factor(retirement), Cat_retirement = as.factor( Cat_retirement))
# convert the age into category
tidy_table$Cat_age<- cut(tidy_table$age, breaks=c(0,20, 30, 40, 50, 60 ,70,100), right = FALSE,labels=c("<20","20s","30s","40s","50s","60s",">=70"))
# convert FinancialSecurity Motivation into category
tidy_table$Cat_Fin<- cut(tidy_table$FinancialSecurity, breaks=c(2,4,5,8,9,10), right = FALSE,labels=c("not important","slightly important","moderate important","very important","extrem important"))
str(tidy_table)
## Classes 'tbl_df', 'tbl' and 'data.frame': 366 obs. of 25 variables:
## $ FinancialSecurity: num 8 7.67 6.33 8.33 8.33 ...
## $ tv : num 0 4 10 6 5 20 2 2 14 0 ...
## $ intrnet : num 2 5 7 3 2 10 1 2 5 25 ...
## $ socmed : num 0 1 0 1 0 2 0.5 0.5 2 5 ...
## $ charit : num 250 0 2000 2500 500 500 12000 0 200 400 ...
## $ excer : num 2 3 6 9 18 1 5 2 4 7 ...
## $ garden : num 0 0 6 4 2 0 0 0 0 2 ...
## $ impulse : num 300 500 2000 200 2000 2500 100 200 300 2500 ...
## $ eatout : num 10 5 4 1 15 30 12 6 2 1 ...
## $ pubtran : num 80 0 40 40 0 0 0 30 6 10 ...
## $ sportfan : num 1 7 5 6 4 5 1 5 5 3 ...
## $ commun : num 2 4 5 6 4 5 4 3 1 7 ...
## $ envichoi : num 3 5 6 6 7 6 4 5 5 7 ...
## $ brandloyal : num 2 5 2 1 1 7 5 3 5 1 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 1 1 ...
## $ age : num 55 39 61 63 40 34 30 23 28 36 ...
## $ jobstatus : Factor w/ 5 levels "Administration",..: 5 2 2 1 2 2 4 4 4 4 ...
## $ Hhincome : Factor w/ 8 levels "$100,000-$149,999",..: 3 7 2 5 5 1 1 8 6 8 ...
## $ retirement : Factor w/ 9 levels "<20k",">=500k",..: 1 5 9 2 8 5 6 9 1 5 ...
## $ educ : Factor w/ 6 levels "college graduate",..: 2 2 2 2 2 2 2 1 1 1 ...
## $ marstatus : Factor w/ 6 levels "Divorced","Married",..: 5 2 2 2 5 2 2 5 2 1 ...
## $ Cat_Hhincome : Factor w/ 3 levels "High","Middle",..: 3 2 1 2 2 2 2 3 2 3 ...
## $ Cat_retirement : Factor w/ 2 levels "NO","YES": 2 2 1 2 2 2 2 1 2 2 ...
## $ Cat_age : Factor w/ 7 levels "<20","20s","30s",..: 5 3 6 6 4 3 3 2 2 3 ...
## $ Cat_Fin : Factor w/ 5 levels "not important",..: 4 3 3 4 4 3 5 4 3 1 ...
summary(tidy_table)
## FinancialSecurity tv intrnet socmed
## Min. :2.000 Min. : 0.00 Min. : 1.00 Min. : 0.000
## 1st Qu.:6.333 1st Qu.: 5.00 1st Qu.: 5.00 1st Qu.: 1.000
## Median :7.333 Median :10.00 Median :12.00 Median : 2.000
## Mean :7.023 Mean :15.51 Mean :15.82 Mean : 4.198
## 3rd Qu.:8.000 3rd Qu.:21.00 3rd Qu.:20.00 3rd Qu.: 5.000
## Max. :9.000 Max. :80.00 Max. :77.00 Max. :40.000
##
## charit excer garden impulse
## Min. : 0.0 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 10.0 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 61.25
## Median : 100.0 Median : 4.000 Median : 0.000 Median : 200.00
## Mean : 791.1 Mean : 4.773 Mean : 1.777 Mean : 443.37
## 3rd Qu.: 500.0 3rd Qu.: 7.000 3rd Qu.: 2.000 3rd Qu.: 500.00
## Max. :15000.0 Max. :30.000 Max. :21.000 Max. :5000.00
##
## eatout pubtran sportfan commun
## Min. : 0.00 Min. : 0.000 Min. :1.000 Min. :1.000
## 1st Qu.: 2.00 1st Qu.: 0.000 1st Qu.:2.000 1st Qu.:4.000
## Median : 3.00 Median : 0.000 Median :5.000 Median :4.500
## Mean : 4.53 Mean : 4.612 Mean :4.216 Mean :4.415
## 3rd Qu.: 5.00 3rd Qu.: 3.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :30.00 Max. :90.000 Max. :7.000 Max. :7.000
##
## envichoi brandloyal gender age
## Min. :1.000 Min. :1.000 Female:251 Min. :18.00
## 1st Qu.:4.000 1st Qu.:3.000 Male :115 1st Qu.:28.00
## Median :5.000 Median :4.000 Median :43.00
## Mean :4.989 Mean :3.995 Mean :43.22
## 3rd Qu.:6.000 3rd Qu.:5.000 3rd Qu.:57.00
## Max. :7.000 Max. :7.000 Max. :88.00
##
## jobstatus Hhincome retirement
## Administration: 6 $20,000-$39,999 :81 NO :158
## Faculty : 38 Below $20,000 :79 <20k : 78
## Non-university:211 $40,000-$59,999 :65 20k-60k : 36
## Staff : 80 $80,000-$99,999 :44 100k-150k: 23
## Student : 31 $100,000-$149,999:42 >=500k : 21
## $60,000-$79,999 :40 60k-100k : 20
## (Other) :15 (Other) : 30
## educ marstatus Cat_Hhincome Cat_retirement
## college graduate :121 Divorced : 31 High : 15 NO :158
## graduate degree : 84 Married :165 Middle :191 YES:208
## high school graduate: 43 Other : 4 Pov/Low:160
## other : 3 Partnered: 49
## some college :111 single :108
## some high school : 4 Widowed : 9
##
## Cat_age Cat_Fin
## <20 : 9 not important : 13
## 20s :92 slightly important: 18
## 30s :63 moderate important:210
## 40s :66 very important : 86
## 50s :61 extrem important : 39
## 60s :56
## >=70:19
#Create function for frequency tables
count_table <- function(x,colname){
x = enquo(x)
kable(
tidy_table %>%
tabyl(!!x) %>%
adorn_totals()%>%
adorn_pct_formatting(digits = 0 ),
digits = 2,
format = "html",
align = c("l","c","c"),
col.names = c(colname,"Count","Total")
)%>%
kable_styling(full_width = F)}
#Make count tables for univariate variables for segmentation
count_table(Cat_Fin,"Value of Financial Security")
| Value of Financial Security | Count | Total |
|---|---|---|
| not important | 13 | 4% |
| slightly important | 18 | 5% |
| moderate important | 210 | 57% |
| very important | 86 | 23% |
| extrem important | 39 | 11% |
| Total | 366 | 100% |
count_table(Cat_age,"Age")
| Age | Count | Total |
|---|---|---|
| <20 | 9 | 2% |
| 20s | 92 | 25% |
| 30s | 63 | 17% |
| 40s | 66 | 18% |
| 50s | 61 | 17% |
| 60s | 56 | 15% |
| >=70 | 19 | 5% |
| Total | 366 | 100% |
count_table(Cat_Hhincome,"Household Income")
| Household Income | Count | Total |
|---|---|---|
| High | 15 | 4% |
| Middle | 191 | 52% |
| Pov/Low | 160 | 44% |
| Total | 366 | 100% |
count_table(retirement,"Retirment Plan")
| Retirment Plan | Count | Total |
|---|---|---|
| <20k | 78 | 21% |
| >=500k | 21 | 6% |
| 100k-150k | 23 | 6% |
| 200k-300k | 14 | 4% |
| 20k-60k | 36 | 10% |
| 300k-400k | 13 | 4% |
| 400k-500k | 3 | 1% |
| 60k-100k | 20 | 5% |
| NO | 158 | 43% |
| Total | 366 | 100% |
count_table(marstatus,"Marital Status")
| Marital Status | Count | Total |
|---|---|---|
| Divorced | 31 | 8% |
| Married | 165 | 45% |
| Other | 4 | 1% |
| Partnered | 49 | 13% |
| single | 108 | 30% |
| Widowed | 9 | 2% |
| Total | 366 | 100% |
count_table(gender,"Gender")
| Gender | Count | Total |
|---|---|---|
| Female | 251 | 69% |
| Male | 115 | 31% |
| Total | 366 | 100% |
count_table(educ,"Education Attainment")
| Education Attainment | Count | Total |
|---|---|---|
| college graduate | 121 | 33% |
| graduate degree | 84 | 23% |
| high school graduate | 43 | 12% |
| other | 3 | 1% |
| some college | 111 | 30% |
| some high school | 4 | 1% |
| Total | 366 | 100% |
We begin with exploration of the categorical variables.
# Code histograms using grid.arrange so can see all variables together
grid.arrange(
# distribution by retirement
tidy_table %>%
ggplot(aes(retirement))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
tidy_table %>%
ggplot(aes(Cat_retirement))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
# distribution by value of Financial Security
tidy_table %>%
ggplot(aes(Cat_Fin))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
ncol=2
)
# Demographical Information
grid.arrange(
# distribution by gender
tidy_table %>%
ggplot(aes(gender))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
# distribution by jobstatus
tidy_table %>%
ggplot(aes(jobstatus)) +
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
# distribution by educ
tidy_table %>%
ggplot(aes(educ))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
# distribution by marstatus
tidy_table %>%
ggplot(aes(marstatus))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
# distribution by Hhincome
tidy_table %>%
ggplot(aes(Cat_Hhincome))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
# distribution by Age
tidy_table %>%
ggplot(aes(Cat_age))+
geom_bar() +
theme(axis.text.x = element_text(angle=60, hjust=1)),
ncol=2
)
Having completed our examination of the categorical variables, we move on to the numeric variables.
# Code histograms using grid.arrange so can see all quant variables together
grid.arrange(
# Value of Financial Security distribution
tidy_table %>%
ggplot(aes(FinancialSecurity)) +
geom_histogram(),
# Attitude of Sportfan distribution
tidy_table %>%
ggplot(aes(sportfan)) +
geom_histogram(),
# Attitude of Community connected distribution
tidy_table %>%
ggplot(aes(commun)) +
geom_histogram(),
# Attitude of Environment Friendly distribution
tidy_table %>%
ggplot(aes(envichoi)) +
geom_histogram(),
# Behavior of money on charities
tidy_table %>%
ggplot(aes(charit)) +
geom_histogram(),
# Behavior of time on whatching TV distribution
tidy_table %>%
ggplot(aes(tv)) +
geom_histogram(),
# Behavior of time on surfing internet distribution
tidy_table %>%
ggplot(aes(intrnet)) +
geom_histogram(),
# Behavior of time on social media distribution
tidy_table %>%
ggplot(aes(socmed)) +
geom_histogram(),
# Behavior of time on exercise distribution
tidy_table %>%
ggplot(aes(excer)) +
geom_histogram()
)
First, we examine overall characteristics using cross-tabs and table
# education and retirement plan balance
tidy_table %>%
tabyl(educ,Cat_retirement) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 0)
## educ NO YES Total
## college graduate 11% 22% 33%
## graduate degree 5% 17% 23%
## high school graduate 8% 4% 12%
## other 0% 1% 1%
## some college 18% 13% 30%
## some high school 1% 1% 1%
## Total 43% 57% 100%
# gender and retirement plan balance
tidy_table %>%
tabyl(gender,retirement) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 0)
## gender <20k >=500k 100k-150k 200k-300k 20k-60k 300k-400k 400k-500k
## Female 15% 3% 5% 2% 6% 3% 0%
## Male 6% 2% 2% 2% 4% 1% 1%
## Total 21% 6% 6% 4% 10% 4% 1%
## 60k-100k NO Total
## 4% 30% 69%
## 1% 13% 31%
## 5% 43% 100%
# household income and retirement plan balance
tidy_table %>%
tabyl(Cat_Hhincome,Cat_retirement) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 0)
## Cat_Hhincome NO YES Total
## High 1% 3% 4%
## Middle 14% 38% 52%
## Pov/Low 28% 16% 44%
## Total 43% 57% 100%
# marital status and Value of Financial Security
tidy_table %>%
tabyl(marstatus,Cat_Fin) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 0)
## marstatus not important slightly important moderate important
## Divorced 1% 0% 4%
## Married 1% 2% 28%
## Other 0% 0% 1%
## Partnered 1% 1% 8%
## single 1% 2% 17%
## Widowed 0% 0% 0%
## Total 4% 5% 57%
## very important extrem important Total
## 3% 1% 8%
## 10% 4% 45%
## 1% 0% 1%
## 3% 1% 13%
## 6% 3% 30%
## 1% 1% 2%
## 23% 11% 100%
# marital status and retirement plan balance
tidy_table %>%
tabyl(marstatus,Cat_retirement) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 0)
## marstatus NO YES Total
## Divorced 4% 5% 8%
## Married 13% 32% 45%
## Other 1% 1% 1%
## Partnered 8% 5% 13%
## single 16% 13% 30%
## Widowed 1% 2% 2%
## Total 43% 57% 100%
Now, we move on to understanding covariance graphically and to understanding the relationships among numeric variables.
# Use tile graph to show which group value Financial Security most
ggplotly(p<- tidy_table %>%
group_by(gender,Cat_Hhincome) %>%
summarise(Fina = mean(FinancialSecurity)) %>%
ggplot(aes(gender,Cat_Hhincome)) +
geom_tile(aes(fill = -Fina)))
ggplotly(p<- tidy_table %>%
group_by(marstatus,gender) %>%
summarise(Fina = mean(FinancialSecurity)) %>%
ggplot(aes(marstatus,gender)) +
geom_tile(aes(fill = -Fina)))
ggplotly(p<- tidy_table %>%
group_by(marstatus,Cat_age) %>%
summarise(Fina = mean(FinancialSecurity)) %>%
ggplot(aes(marstatus,Cat_age)) +
geom_tile(aes(fill = -Fina)))
ggplotly(p <- tidy_table %>%
group_by(educ,marstatus) %>%
summarise(Fina = mean(FinancialSecurity)) %>%
ggplot(aes(educ, marstatus)) +
theme(axis.text.x = element_text(angle=60, hjust=1)) +
geom_tile(aes(fill = -Fina)))
+ Finding of the data: ++ The following segments of people value Financial Security most: Middleincome Female 7.27, WidowFemale 7.37, Divorced Male 7.73, 60s widowed 9, widowed with graduate degree 8.5 with highschool degree 9
# Use cor to get the correlation between the variable with interest
pairs_data<-tidy_table%>%dplyr::select(FinancialSecurity,tv,intrnet,socmed,charit,excer,sportfan,commun,envichoi,gender,age,Cat_Hhincome,marstatus)
ggpairs(pairs_data)
From these graphs, it’s unclear which targets should be made and how the marketing should be positioned, I will research this further in the detailed EDA.
# Boxplots of Finacialsecurity by gender, marital status, householdincome and education
grid.arrange(
# Finacialsecurity by gender
tidy_table %>%
ggplot(aes(x = gender, y = FinancialSecurity)) +
geom_boxplot() +
coord_flip(),
# Finacialsecurity by marital status
tidy_table %>%
ggplot(aes(x = marstatus, y = FinancialSecurity)) +
geom_boxplot() +
coord_flip(),
# Finacialsecurity by Hhincome
tidy_table %>%
ggplot(aes(x = Cat_Hhincome, y = FinancialSecurity)) +
geom_boxplot() +
coord_flip(),
# Finacialsecurity by Education
tidy_table %>%
ggplot(aes(x = educ, y = FinancialSecurity)) +
geom_boxplot() +
coord_flip(),
ncol = 2
)
For this section of the exploration, we examine the statistical validity of the most interesting findings discussed in the basic EDA.
Question1: Is the participation of the retirement plan related to the higher value of Financial Security?
g1 <- tidy_table %>%
group_by(Cat_Fin,Cat_retirement) %>%
summarise(count= n()) %>%
ggplot(aes(x = Cat_Fin, y = count, fill = Cat_retirement)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "retirement plan", y = "Percentage") +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "top",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_fill_brewer(palette = "Blues") +
ggtitle("Retirement Plan Distribution By value of Financial Security\nTarget at the people with high value of Financial Security")
g1
Question2: Is the participation of the retirement plan related to the marital status?
# remove "other" information in marital status
g2 <- tidy_table %>%filter(marstatus!="Other"&gender=="Female")%>%
group_by(marstatus,Cat_retirement) %>%
summarise(count= n()) %>%
ggplot(aes(x = marstatus, y = count, fill = Cat_retirement)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "retirement plan",y = "Percentage") +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "top",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_fill_brewer(palette = "Blues") +
ggtitle("Retirement Plan Distribution By Marital Status",
subtitle = "Target at the people for widowed or married people")
g2
Question3: Is the participation of the retirement plan related to age?
g3 <-
tidy_table %>%
group_by(Cat_age, Cat_retirement) %>%
summarise(count= n()) %>%
ggplot(aes(x = Cat_age, y = count, fill = Cat_retirement)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "retirement plan", y = "Percentage") +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "top",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_x_discrete(limits=c("<20","20s",">=70","40s","30s","60s","50s")) +
scale_fill_brewer(palette = "Blues") +
ggtitle("Retirement Plan Distribution By Age\nTarget at people in 50s and 60s")
g3
Question4: Is the participation of the retirement plan related to age?
g4 <-
tidy_table %>%
group_by(educ, Cat_retirement,gender) %>%
summarise(count= n()) %>%
ggplot(aes(x = educ, y = count, fill = Cat_retirement)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "retirement plan", y = "Percentage") +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "top",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_fill_brewer(palette = "Blues") +
ggtitle("Retirement Plan Distribution By Education",
subtitle = "higher edcuation maybe more likely to join the plan")
g4
Question5: How Value of Financial Security differentiated by age?
g5 <-tidy_table %>%
group_by(Cat_age, Cat_Fin) %>%
summarise(count= n()) %>%
ggplot(aes(x = Cat_age, y = count, fill =Cat_Fin)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "Value", y = "Percentage") +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "top",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_fill_brewer(palette = "Blues") +
scale_x_discrete(limits=c("<20",">=70","20s","30s","40s","60s","50s")) +
ggtitle("Value of Financial Security By Age\n 50s is a good target - more than 98% think it important")
g5
Question6: How Value of Financial Security differentiated by marital status?
g6 <-tidy_table %>%
group_by(marstatus, Cat_Fin) %>%filter(marstatus!="Other")%>%
summarise(count= n()) %>%
ggplot(aes(x = marstatus, y = count, fill =Cat_Fin)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "Value", y = "Percentage") +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "right",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_fill_brewer(palette = "Blues") +
ggtitle("Value of Financial Security By Marital Status\nwidowed peopleis a good target- most of them think it extrem or very important")+
scale_x_discrete(limits=c("Widowed","Divorced","Married","single","Partnered"))
g6
Question7: How Value of Financial Security differentiated by household income?
g7 <-tidy_table %>%
group_by(Cat_Hhincome, Cat_Fin) %>%
summarise(count= n()) %>%
ggplot(aes(x = Cat_Hhincome, y = count, fill =Cat_Fin)) +
geom_bar(stat = "identity", position = "fill") +
labs(fill = "Value", y = "Percentage") +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme_classic() +
guides(fill = guide_legend(reverse = TRUE)) +
theme(legend.position = "right",
legend.justification = "center",
legend.title = element_text(face = "bold"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
scale_fill_brewer(palette = "Blues") +
ggtitle("Value of Financial Security By Household Income",
subtitle = "Most widowed people value Financial Security extrem or very imporance")
g7
Question8: how participation of the retirement plan related to both the Householdincome and marital status?
g8 <- tidy_table %>% filter(marstatus!="Other") %>%
group_by(Cat_Hhincome,marstatus,Cat_retirement) %>%
summarise(count = n()) %>%
ggplot(aes(x = marstatus, y = count, fill=Cat_retirement)) +
geom_bar(stat = "identity", position = "stack") +
theme_classic() +
coord_flip() +
labs(fill = "Retirement Plan",y="count") +
theme(axis.text = element_text(face = "bold", size = 9),
axis.title = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
legend.position = "right",
legend.title = element_text(face = "bold")) +
ggtitle("participation of the retirement plan by household income and marital status\n married people with Middlehousehold income is a large potential ") +
scale_fill_brewer() +
facet_grid(Cat_Hhincome ~ .)
g8
Question9: Is the participation of the retirement plan related to both the age and marital status?
g9 <- tidy_table %>% filter(Cat_age!="<20",marstatus!="Other") %>%
group_by(Cat_age,marstatus,Cat_retirement) %>%
summarise(count = n()) %>%
mutate(total = sum(count)) %>%
mutate(percent = count/total) %>%
ggplot(aes(x = marstatus, y = percent, fill=Cat_retirement)) +
geom_bar(stat = "identity", position = "fill") +
theme_classic() +
coord_flip() +
labs(fill = "Retirement Plan") +
scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
theme(axis.text = element_text(face = "bold", size = 9),
axis.title = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
legend.position = "top",
legend.title = element_text(face = "bold")) +
ggtitle("participation of the retirement plan by age and marital status",
subtitle = "100% participation for 50s,60s widowed & 30s,70s divorced people") +
scale_fill_brewer() +
facet_wrap(.~ Cat_age)
g9
# remove age under 20, because 21 is the leagal age to join retirement plan
z <- qnorm(.95)
q_g1 <- tidy_table %>%
group_by(marstatus, Cat_retirement) %>%
summarise(Fina = mean(FinancialSecurity), sd = sd(FinancialSecurity),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = marstatus, y = Fina, fill = Cat_retirement)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
geom_errorbar(aes(ymin = Fina - ci, ymax = Fina + ci),
width = 0.5, position = position_dodge(0.9)) +
theme_classic() +
labs(fill = "Retirement Plan") +
theme(axis.text = element_text(face = "bold", size = 11),
axis.title = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
axis.line.x = element_line(colour = "grey"),
legend.position = "top",
legend.title = element_text(face = "bold")) +
scale_fill_brewer() +
ggtitle("Average Financial Security by Marital Status and Retirement Plan Participation",
subtitle = "95% Confidence Interval ")
q_g1
# remove age under 20, because 21 is the leagal age to join retirement plan
z <- qnorm(.95)
q_g2 <- tidy_table %>% filter(Cat_age!="<20") %>%
group_by(Cat_age, Cat_retirement) %>%
summarise(Fina = mean(FinancialSecurity), sd = sd(FinancialSecurity),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = Cat_age, y = Fina, fill = Cat_retirement)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
geom_errorbar(aes(ymin = Fina - ci, ymax = Fina + ci),
width = 0.5, position = position_dodge(0.9)) +
theme_classic() +
labs(fill = "Retirement Plan") +
theme(axis.text = element_text(face = "bold", size = 11),
axis.title = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
axis.line.x = element_line(colour = "grey"),
legend.position = "top",
legend.title = element_text(face = "bold")) +
scale_fill_brewer() +
ggtitle("Average Financial Security by Age and Retirement Plan Participation",
subtitle = "95% Confidence Interval ")
q_g2
Logit model and calculate odd ratios
tidy_table <- tidy_table %>% mutate(Part_Retirement = case_when(Cat_retirement == "NO" ~ "0" , TRUE~ "1"), Part_Retirement =as.numeric(Part_Retirement ))
# Logtit model
Logit <- glm( Part_Retirement ~ Cat_Fin+Cat_age+jobstatus+Hhincome+marstatus,
family = binomial(link = "logit"),
data = tidy_table)
summary(Logit)
##
## Call:
## glm(formula = Part_Retirement ~ Cat_Fin + Cat_age + jobstatus +
## Hhincome + marstatus, family = binomial(link = "logit"),
## data = tidy_table)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4736 -0.8970 0.4444 0.8361 2.2063
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.88169 1.95784 -0.450 0.65247
## Cat_Finslightly important -0.44567 0.91137 -0.489 0.62483
## Cat_Finmoderate important -0.53801 0.73774 -0.729 0.46584
## Cat_Finvery important -0.33175 0.76293 -0.435 0.66368
## Cat_Finextrem important 0.38989 0.82443 0.473 0.63627
## Cat_age20s 2.59199 1.30874 1.981 0.04765 *
## Cat_age30s 2.77253 1.38421 2.003 0.04518 *
## Cat_age40s 2.77059 1.38458 2.001 0.04539 *
## Cat_age50s 3.64575 1.40883 2.588 0.00966 **
## Cat_age60s 3.66620 1.42334 2.576 0.01000 *
## Cat_age>=70 2.71593 1.50274 1.807 0.07071 .
## jobstatusFaculty 0.33505 1.25759 0.266 0.78992
## jobstatusNon-university -0.79140 1.16236 -0.681 0.49596
## jobstatusStaff -0.67297 1.24412 -0.541 0.58856
## jobstatusStudent 0.77911 1.27867 0.609 0.54232
## Hhincome$150,000-$199,999 0.21085 1.20148 0.175 0.86069
## Hhincome$20,000-$39,999 -0.95116 0.50074 -1.900 0.05750 .
## Hhincome$200,000 & above 0.64665 1.37819 0.469 0.63893
## Hhincome$40,000-$59,999 -0.17911 0.50263 -0.356 0.72159
## Hhincome$60,000-$79,999 -0.48364 0.53942 -0.897 0.36994
## Hhincome$80,000-$99,999 0.22075 0.57234 0.386 0.69972
## HhincomeBelow $20,000 -2.17140 0.53078 -4.091 4.3e-05 ***
## marstatusMarried -0.04654 0.50715 -0.092 0.92689
## marstatusOther -0.10663 1.26817 -0.084 0.93299
## marstatusPartnered -0.67002 0.58297 -1.149 0.25043
## marstatussingle -0.07409 0.53681 -0.138 0.89022
## marstatusWidowed 0.54053 0.97805 0.553 0.58050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 500.53 on 365 degrees of freedom
## Residual deviance: 388.38 on 339 degrees of freedom
## AIC: 442.38
##
## Number of Fisher Scoring iterations: 5
# calculate odd ratios
logitor(Part_Retirement ~ Cat_Fin+Cat_age+jobstatus+Hhincome+marstatus,data = tidy_table)
## Call:
## logitor(formula = Part_Retirement ~ Cat_Fin + Cat_age + jobstatus +
## Hhincome + marstatus, data = tidy_table)
##
## Odds Ratio:
## OddsRatio Std. Err. z P>|z|
## Cat_Finslightly important 0.640395 0.583638 -0.4890 0.62483
## Cat_Finmoderate important 0.583909 0.430773 -0.7293 0.46584
## Cat_Finvery important 0.717665 0.547528 -0.4348 0.66368
## Cat_Finextrem important 1.476824 1.217538 0.4729 0.63627
## Cat_age20s 13.356301 17.479952 1.9805 0.04765 *
## Cat_age30s 15.998998 22.146043 2.0030 0.04518 *
## Cat_age40s 15.967987 22.109019 2.0010 0.04539 *
## Cat_age50s 38.311385 53.974378 2.5878 0.00966 **
## Cat_age60s 39.103065 55.656855 2.5758 0.01000 *
## Cat_age>=70 15.118595 22.719314 1.8073 0.07071 .
## jobstatusFaculty 1.398005 1.758115 0.2664 0.78992
## jobstatusNon-university 0.453209 0.526791 -0.6809 0.49596
## jobstatusStaff 0.510192 0.634740 -0.5409 0.58856
## jobstatusStudent 2.179534 2.786902 0.6093 0.54232
## Hhincome$150,000-$199,999 1.234729 1.483507 0.1755 0.86069
## Hhincome$20,000-$39,999 0.386293 0.193431 -1.8995 0.05750 .
## Hhincome$200,000 & above 1.909129 2.631151 0.4692 0.63893
## Hhincome$40,000-$59,999 0.836016 0.420208 -0.3563 0.72159
## Hhincome$60,000-$79,999 0.616538 0.332574 -0.8966 0.36994
## Hhincome$80,000-$99,999 1.247009 0.713711 0.3857 0.69972
## HhincomeBelow $20,000 0.114018 0.060519 -4.0909 4.296e-05 ***
## marstatusMarried 0.954531 0.484093 -0.0918 0.92689
## marstatusOther 0.898862 1.139912 -0.0841 0.93299
## marstatusPartnered 0.511698 0.298306 -1.1493 0.25043
## marstatussingle 0.928587 0.498475 -0.1380 0.89022
## marstatusWidowed 1.716909 1.679229 0.5527 0.58050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Indeed, here I find statistical confirmation of some of the visual exploration.
For the age category, at the 1% significance level, the estimated odds of participating the financial retirement plan for people in 50s and 60s is 38.31 and 39.1 times higher compared to the group under 20s. For the Household income category, it indicate that the lower the income, the less likely a person would join the retirement plan, the estimated odds of participating the plan for people with HhincomeBelow 20,000 is 88.6% lower than that of the people with HhincomeBelow over 200,000. For the Marital Status, it shows that the estimated odds for Widowed person to partipate the plan is 1.7 times higher than the divorced one, however, this result is not statistically significant at 10% level.
# Save the rds file so I can reuse anything from this file in another file
save.image("Final_TA.RData")